Cream of the Crop 3

home *** CD-ROM | disk | FTP | other *** search

/ Cream of the Crop 3 / Cream of the Crop 3.iso / utility / secdrv.zip / CRYPT2.ASM < prev next >

Wrap

Assembly Source File | 1993-11-19 | 20KB | 763 lines

%PAGESIZE 59            ; Turbo assembler formatting codes
%BIN 13
%LINUM 3
; Copyright (c) 1993 Colin Plumb.  This code may be freely
; distributed under the terms of the GNU General Public Licence.
; Internet <colin@nyx.cs.du.edu>

        .model large
        .code

; A core operation in IDEA is multiplication modulo 65537.
; The valid inputs, 1 through 65536 inclusive, are represented in
; 16-bit registers modulo 65536.  I.e. a value of 0 means 65536,
; or -1.  Thus, we need to test for that specially.  -x, modulo
; 65537, is 65537-x = 1-x.
; For any other number, represent the product as a*65536+b.  Since
; 65536 = -1 (mod 65537), this is the same number as b-a.  Should
; this result be negative (generate a borrow), -n mod 65537 = 1-n
; mod 65536.  Or in other words, if you add the borrow bit back on,
; you get the right answer.

; This is what the assembly code does.  It forms a zero, and adds
; that on with carry.

; Another useful optimisation takes advantage of the fact that
; a and b are equal only if the answer is congruent to 0 mod 65537.
; Since 65537 is prime, this happens only if one of the inputs is
; congruent to 0 mod 65537.  Since the inputs are all less than 65537,
; this means it must have been zero.

; The code below tests for a zero result of the subtraction, and if
; one arises, it branches out of line to figure out what happened.

; This code implements the IDEA encryption algorithm.
; It follows in pseudo-C, where the * operator operates
; modulo 65537, as Idea needs.  (If you don't understand,
; learn IDEA better.)

; IDEA works on 16-bit units.  If you're processing bytes,
; it's defined to be big-endian, so an Intel machine needs to
; swap the bytes around.

; void Idea(u_int16 *in, u_int16 *out, u_int16 *key)
; {
;       register u_int16 x0, x1, x2, x3, s1, s2, round;
;
;       x0 = *in++;  x1 = *in++;  x2 = *in++;  x3 = *in;
;
;       for (round = 0; round < 8; round++) {
;               x0 *= *key++;
;               x1 += *key++;
;               x2 += *key++;
;               x3 *= *key++;
;
;               s1 = x1;  s2 = x2;
;               x2 ^= x0;  x1 ^= x3;
;
;               x2 *= *key++;
;               x1 += x2;
;               x1 *= *key++;
;               x2 += x1;
;
;               x0 ^= x1;  x3 ^= x2;
;               x1 ^= s2;  x2 ^= s1;
;       }
;       *out++ = x0 * *key++;
;       *out++ = x2 + *key++;   /* Yes, this is x2, not x1 */
;       *out++ = x1 + *key++;
;       *out   = x3 * *key;
; }

; ds:si points to key, ax, dx are temps, args in bx, cx, di, bp
; Trashes *all* registers.  direction flag must be clear.
; Leaves es zero.

; Since there is no spare register to hold the loop count, I make
; clever use of the stack, pushing the start of the loop several
; times and using a ret instruction to do the return.

; Annoyingly, lods is fastest on 8086's, but other techniques are
; best on 386's.  Well, that's what the manual says, but real
; life is different.  USELODS wins on a 386SX, at least.
; Leave it set for all platforms.

USELODS equ     1               ; nonzero: fetch key words with lodsw;
                                ; zero: use [si+disp] addressing instead

; bp must be x0 for some of the code below to work
x0      equ     bp
x1      equ     bx
x2      equ     cx
x3      equ     di      ; di must be x3 for some of the code below to work

;; Now, this is rather interesting.  We test for zero arguments
;; after the multiply.  Assuming random inputs, one or both are
;; zero (2^17-1)/2^32, or approximately 1/32768 of the time.
;; Encryption in any feedback mode produces essentially random
;; inputs, so average-case analysis is okay.  While we don't
;; want the out-of-line code to waste time, it is not worth
;; slowing down the in-line case to speed it up.
;;
;; Basically, we start inverting the source x, and if that was 0,
;; we use the inverse of the key instead.
; Out-of-line handlers for the first unrolled round.  Each one is entered
; when "sub ax,dx" after a multiply gave zero, i.e. one of the two factors
; was 0 (standing for 65536 = -1 mod 65537), so the product is 1 - (the
; other factor).  The data register is negated first; if it was itself
; zero, the key word is subtracted instead.  With USELODS the key word was
; just fetched by lodsw, so it lives at [si-2].
; These handlers sit before Core, and the ones for the second round after
; it, presumably to keep the conditional jumps within short range.
Core1Z:
        neg     x0
        jnz     Core1Za
if USELODS
        sub     x0,[si-2]
else
        sub     x0,[si]
endif
Core1Za:
        inc     x0
        jmp     Core1done
Core2Z:
        neg     x3
        jnz     Core2Za
if USELODS
        sub     x3,[si-2]
else
        sub     x3,[si+6]
endif
Core2Za:
        inc     x3
        jmp     Core2done
Core3Z:
        neg     x2
        jnz     Core3Za
if USELODS
        sub     x2,[si-2]
else
        sub     x2,[si+8]
endif
Core3Za:
        inc     x2
        jmp     Core3done
Core4Z:
        neg     x1
        jnz     Core4Za
if USELODS
        sub     x1,[si-2]
else
        sub     x1,[si+10]
endif
Core4Za:
        inc     x1
        jmp     Core4done

; We need a constant 0 that we can move into a register without affecting
; the carry flag (as the classic xor ax,ax is wont to do), so we use the
; es register for a constant 0 source.  This is okay even in protected
; mode.  (I *told* you this was tricky code!)

; BTW, since you wanted to know, this is 8 + 78*4 + 16 = 336 instructions.

;-----------------------------------------------------------------------
; Core - the IDEA block transform (8 rounds plus the output step).
; In:    ds:si -> key words; x0 (bp), x1 (bx), x2 (cx), x3 (di) = block
;        direction flag clear (lodsw is used when USELODS is set)
; Out:   x0 (bp), x1 (bx), x2 (cx) = first three output words,
;        dx = fourth output word (NOT x3/di); es = 0
; Clobb: ax, dx, si, di, es, flags
; The loop body below holds two rounds.  Its address is pushed three
; times on top of the address of Finish, so the "ret" at the end of the
; body runs it four times in total and then falls into Finish, whose own
; ret returns to the caller.
;-----------------------------------------------------------------------
Core    proc    near
        xor     ax,ax
        mov     es,ax                   ; es = constant 0 for "mov reg,es"
        mov     ax,OFFSET Finish
        push    ax
        mov     ax,OFFSET Coreloop
        push    ax                      ; Loop 3 times, then return
        push    ax
        push    ax
Coreloop:
if USELODS
        lodsw
else
        mov     ax,[si]                 ; x0 *= *key++
endif
        mul     x0
        sub     ax,dx                   ; low word - high word, CF = borrow
        jz      Core1Z
        mov     x0,es                   ; x0 = 0 without touching CF
        adc     x0,ax                   ; add the borrow back on
Core1done:
if USELODS
        lodsw
        add     x1,ax
        lodsw
        add     x2,ax
else
        add     x1,[si+2]               ; x1 += *key++
        add     x2,[si+4]               ; x2 += *key++
endif
if USELODS
        lodsw
else
        mov     ax,[si+6]               ; x3 *= *key++
endif
        mul     x3
        sub     ax,dx
        jz      Core2Z
        mov     x3,es
        adc     x3,ax
Core2done:
        push    x1                      ; s1 = x1
        push    x2                      ; s2 = x2
        xor     x1,x3                   ; x1 ^= x3
        xor     x2,x0                   ; x2 ^= x0
if USELODS
        lodsw
else
        mov     ax,[si+8]               ; x2 *= *key++
endif
        mul     x2
        sub     ax,dx
        jz      Core3Z
        mov     x2,es
        adc     x2,ax
Core3done:
        add     x1,x2                   ; x1 += x2
if USELODS
        lodsw
else
        mov     ax,[si+10]              ; x1 *= *key++
endif
        mul     x1
        sub     ax,dx
        jz      Core4Z
        mov     x1,es
        adc     x1,ax
Core4done:
        add     x2,x1                   ; x2 += x1
        xor     x0,x1                   ; x0 ^= x1
        xor     x3,x2                   ; x3 ^= x2
        pop     dx
        xor     x1,dx                   ; x1 ^= s2
        pop     dx
        xor     x2,dx                   ; x2 ^= s1

; Second unrolling of loop
if USELODS
        lodsw
else
        mov     ax,[si+12]              ; x0 *= *key++
endif
        mul     x0
        sub     ax,dx
        jz      Core5Z
        mov     x0,es
        adc     x0,ax
Core5done:
if USELODS
        lodsw
        add     x1,ax
        lodsw
        add     x2,ax
else
        add     x1,[si+14]              ; x1 += *key++
        add     x2,[si+16]              ; x2 += *key++
endif
if USELODS
        lodsw
else
        mov     ax,[si+18]              ; x3 *= *key++
endif
        mul     x3
        sub     ax,dx
        jz      Core6Z
        mov     x3,es
        adc     x3,ax
Core6done:
        push    x1                      ; s1 = x1
        push    x2                      ; s2 = x2
        xor     x1,x3                   ; x1 ^= x3
        xor     x2,x0                   ; x2 ^= x0
if USELODS
        lodsw
else
        mov     ax,[si+20]              ; x2 *= *key++
endif
        mul     x2
        sub     ax,dx
        jz      Core7Z
        mov     x2,es
        adc     x2,ax
Core7done:
        add     x1,x2                   ; x1 += x2
if USELODS
        lodsw
else
        mov     ax,[si+22]              ; x1 *= *key++
endif
        mul     x1
        sub     ax,dx
        jz      Core8Z
        mov     x1,es
        adc     x1,ax
Core8done:
        add     x2,x1                   ; x2 += x1
        xor     x0,x1                   ; x0 ^= x1
        xor     x3,x2                   ; x3 ^= x2
        pop     dx
        xor     x1,dx                   ; x1 ^= s2
        pop     dx
        xor     x2,dx                   ; x2 ^= s1

ife USELODS
        lea     si,[si+24]              ; step past the 12 key words used
endif
        ret                             ; Used as a loop instruction!

; Zero-factor handlers for the second unrolled round and for Finish;
; same scheme as Core1Z..Core4Z.
Core5Z:
        neg     x0
        jnz     Core5Za
if USELODS
        sub     x0,[si-2]
else
        sub     x0,[si+12]
endif
Core5Za:
        inc     x0
        jmp     Core5done
Core6Z:
        neg     x3
        jnz     Core6Za
if USELODS
        sub     x3,[si-2]
else
        sub     x3,[si+18]
endif
Core6Za:
        inc     x3
        jmp     Core6done
Core7Z:
        neg     x2
        jnz     Core7Za
if USELODS
        sub     x2,[si-2]
else
        sub     x2,[si+20]
endif
Core7Za:
        inc     x2
        jmp     Core7done
Core8Z:
        neg     x1
        jnz     Core8Za
if USELODS
        sub     x1,[si-2]
else
        sub     x1,[si+22]
endif
Core8Za:
        inc     x1
        jmp     Core8done
Core9Z:
        neg     x0
        jnz     Core9Za
if USELODS
        sub     x0,[si-2]
else
        sub     x0,[si]
endif
Core9Za:
        inc     x0
        jmp     Core9done
; Special: compute into dx (zero on entry)
Core10Z:
        sub     dx,x3
        jnz     Core10Za
if USELODS
        sub     dx,[si-2]
else
        sub     dx,[si+6]
endif
Core10Za:
        inc     dx
;       jmp     Core10done
        ret

; Output transformation.  Reached through the return address pushed at
; the top of Core once the loop body has run four times.
Finish:
if USELODS
        lodsw
else
        mov     ax,[si]                 ; x0 *= *key++
endif
        mul     x0
        sub     ax,dx
        jz      Core9Z
        mov     x0,es
        adc     x0,ax
Core9done:
        xchg    x1,x2                   ; the output step adds to x2 first
if USELODS
        lodsw
        add     x1,ax
        lodsw
        add     x2,ax
else
        add     x1,[si+2]               ; x1 += *key++
        add     x2,[si+4]               ; x2 += *key++
endif
; This is special: compute into dx, not x3
if USELODS
        lodsw
else
        mov     ax,[si+6]               ; x3 *= *key++
endif
        mul     x3
        sub     ax,dx
        mov     dx,es                   ; dx = 0; mov leaves ZF and CF intact
        jz      Core10Z
        adc     dx,ax
Core10done:
        ret
Core    endp

;-----------------------------------------------------------------------
; _Idea2 - encrypt one 8-byte block.
; Args are in, out, key: three far pointers on the stack, read at
; [bp+6], [sp+16] and [bp+14] respectively.
; The four words are byte-swapped on load and on store, since IDEA is
; defined big-endian.
; Preserves bp, si, di, ds.  ax, bx, cx, dx and es are not preserved.
;-----------------------------------------------------------------------
        public  _Idea2
_Idea2  proc far
        cld
        push    bp                      ; Args start at [bp+6]
        mov     bp,sp
        push    si
        push    di
        push    ds      ; 6 more words here, so args are at [sp+12]
        lds     si,[bp+6]               ; in
        lodsw
        xchg    ah,al
        mov     dx,ax                   ; x0 is bp, still needed as frame
        lodsw
        xchg    ah,al
        mov     x1,ax
        lodsw
        xchg    ah,al
        mov     x2,ax
        lodsw
        xchg    ah,al
        mov     x3,ax
        lds     si,[bp+14]              ; key
        mov     x0,dx                   ; last use of bp as frame pointer
        call    Core
        mov     ax,x0
        mov     bp,sp
        les     di,[bp+16]              ; out
        xchg    ah,al
        stosw
        mov     ax,x1
        xchg    ah,al
        stosw
        mov     ax,x2
        xchg    ah,al
        stosw
        mov     ax,dx                   ; Core leaves the fourth word in dx
        xchg    ah,al
        stosw
        pop     ds
        pop     di
        pop     si
        pop     bp
        ret
_Idea2  endp

; Okay, the basic plan for the CFB kernel is
; get x0,x1,x2,x3
; get key pointer
; call core
; get buffer pointers
;Loop:
; lodsw
; xor   ax,x0
; mov   x0,ax
; stosw
; lodsw
; xor   ax,x1
; mov   x1,ax
; stosw
; lodsw
; xor   ax,x2
; mov   x2,ax
; stosw
; lodsw
; xor   ax,x3
; mov   x3,ax
; stosw
; push buffer pointers
; get key pointer
; call core
; pop buffer pointers
; loop
; lodsw/xor/etc.
;
;
; This function is designed to go in the middle of a byte-granularity
; CFB engine.  It performs "len" encryptions of the IV, encrypting
; 8*(len-1) bytes from the source to the destination.  The idea is
; that you first xor any odd leading bytes, then call this function,
; then xor up to 8 trailing bytes.

; The main loop in this is 38 instructions, plus the 336 for the core
; makes 374 total.  That's 46.75 instructions per byte.
; (It's the same for IdeaCFBx)

;-----------------------------------------------------------------------
; _IdeaCFB - CFB-mode encryption kernel.
; IV, key, plain, cipher, len
; The IV is read at entry and overwritten at exit with the last Core
; output.  The plain and cipher offsets and the len word are updated in
; place in the caller's argument slots on the stack.
; NOTE: len is decremented before it is tested, so len = 0 wraps to
; 0FFFFh rather than returning at once; callers must pass len >= 1.
; Preserves bp, si, di, ds.  ax, bx, cx, dx and es are not preserved.
;-----------------------------------------------------------------------
        public  _IdeaCFB
_IdeaCFB proc far                       ; Args are at [sp+4]
        cld
        push    bp
        push    si
        push    di
        push    ds      ; 8 more bytes here, so args are at [sp+12]
; To be precise, IV is at 12, key at 16, plain at 20,
; cipher at 24 and len at 28
        mov     bp,sp
        lds     si,[bp+12]              ; IV
; Load and byte-swap IV
        mov     ax,[si]
        xchg    ah,al
        mov     x1,[si+2]
        mov     x2,[si+4]
        xchg    bh,bl
        xchg    ch,cl
        mov     dx,[si+6]
        xchg    dh,dl
        lds     si,[bp+16]              ; Key
        mov     x0,ax
        mov     x3,dx
        call    Core
IdeaCFBLoop:
;       mov     ax,x0
;       mov     bp,sp
;       dec     WORD PTR [bp+28]        ; Decrement count
;       jz      IdeaCFBEnd
;       lds     si,[bp+20]
;       les     di,[bp+24]
;       mov     x0,ax
; Alternate code: (which is faster?  Two moves or three segment overrides?)
        mov     si,sp
        dec     WORD PTR ss:[si+28]     ; Decrement count
        jz      IdeaCFBEnd
        les     di,ss:[si+24]           ; cipher (destination)
        lds     si,ss:[si+20]           ; plain (source)
        lodsw
        xchg    ah,al
        xor     ax,x0
        mov     x0,ax                   ; ciphertext word feeds back
        xchg    ah,al
        stosw
        lodsw
        xchg    ah,al
        xor     ax,x1
        mov     x1,ax
        xchg    ah,al
        stosw
        lodsw
        xchg    ah,al
        xor     ax,x2
        mov     x2,ax
        xchg    ah,al
        stosw
        lodsw
        xchg    ah,al
        xor     ax,dx                   ; fourth word is in dx after Core
        mov     dx,ax
        xchg    ah,al
        stosw
;       mov     ax,x0
;       mov     bp,sp
;       mov     [bp+20],si      ; Save source offset
;       mov     [bp+24],di      ; Save destination offset
;       lds     si,[bp+16]      ; Key
;       mov     x0,ax           ; Get x0 in place for another iteration
; Alternate code for the above: (which is faster?  One move or three ss:?)
        mov     ax,si
        mov     si,sp
        mov     ss:[si+20],ax           ; Save source offset
        mov     ss:[si+24],di           ; Save destination offset
        lds     si,ss:[si+16]           ; Key
        mov     x3,dx                   ; Get x3 in place
        mov     ax,OFFSET IdeaCFBLoop
        push    ax                      ; Core's final ret resumes the loop
        jmp     Core
IdeaCFBEnd:
;       lds     si,[bp+12]
        lds     di,ss:[si+12]           ; Get IV for writing back
        mov     ax,x0
        xchg    ah,al
        mov     [di],ax                 ; Use stosw?
        xchg    bh,bl
        xchg    ch,cl
        mov     [di+2],x1
        mov     [di+4],x2
        xchg    dh,dl
        mov     [di+6],dx
        pop     ds
        pop     di
        pop     si
        pop     bp
        ret
_IdeaCFB endp

; This decoding step is similar, except that instead of
; lods
; xor   x0,ax
; mov   ax,x0
; stos
; the feedback step is
; lods
; xchg  x0,ax
; xor   ax,x0
; stos

;-----------------------------------------------------------------------
; _IdeaCFBx - CFB-mode decryption kernel.
; IV, key, cipher, plain, len
; Same stack layout and in-place argument updates as _IdeaCFB; the
; source is read through the pointer at [sp+20] and the result written
; through the pointer at [sp+24].
; NOTE: len is decremented before it is tested, so len = 0 wraps to
; 0FFFFh rather than returning at once; callers must pass len >= 1.
; Preserves bp, si, di, ds.  ax, bx, cx, dx and es are not preserved.
;-----------------------------------------------------------------------
        public  _IdeaCFBx
_IdeaCFBx proc far                      ; Args are at [sp+4]
        cld
        push    bp
        push    si
        push    di
        push    ds      ; 8 more bytes here, so args are at [sp+12]
        mov     bp,sp
        lds     si,[bp+12]              ; IV
; Load and byte-swap IV
        mov     ax,[si]
        xchg    ah,al
        mov     x1,[si+2]
        mov     x2,[si+4]
        xchg    bh,bl
        xchg    ch,cl
        mov     dx,[si+6]
        xchg    dh,dl
        lds     si,[bp+16]              ; Key
        mov     x0,ax
        mov     x3,dx
        call    Core
IdeaCFBxLoop:
;       mov     ax,x0
;       mov     bp,sp
;       dec     WORD PTR [bp+28]        ; Decrement count
;       jz      IdeaCFBxEnd
;       lds     si,[bp+20]
;       les     di,[bp+24]
;       mov     x0,ax
; Alternate code: (which is faster?  Two moves or three segment overrides)
        mov     si,sp
        dec     WORD PTR ss:[si+28]     ; Decrement count
        jz      IdeaCFBxEnd
        les     di,ss:[si+24]           ; plain (destination)
        lds     si,ss:[si+20]           ; cipher (source)
        lodsw
        xchg    ah,al
        xchg    x0,ax                   ; x0 = ciphertext, ax = old x0
        xor     ax,x0                   ; ax = plaintext word
        xchg    ah,al
        stosw
        lodsw
        xchg    ah,al
        xchg    x1,ax
        xor     ax,x1
        xchg    ah,al
        stosw
        lodsw
        xchg    ah,al
        xchg    x2,ax
        xor     ax,x2
        xchg    ah,al
        stosw
        lodsw
        xchg    ah,al
        xchg    dx,ax                   ; fourth word is in dx after Core
        xor     ax,dx
        xchg    ah,al
        stosw
;       mov     ax,x0
;       mov     bp,sp
;       mov     [bp+20],si      ; Save source offset
;       mov     [bp+24],di      ; Save destination offset
;       lds     si,[bp+16]      ; Key
;       mov     x0,ax           ; Get x0 in place for another iteration
; Alternate code for the above: (which is faster?  One move or three ss:?)
        mov     ax,si
        mov     si,sp
        mov     ss:[si+20],ax           ; Save source offset
        mov     ss:[si+24],di           ; Save destination offset
        lds     si,ss:[si+16]           ; Key
        mov     x3,dx                   ; Get x3 in place
        mov     ax,OFFSET IdeaCFBxLoop
        push    ax                      ; Core's final ret resumes the loop
        jmp     Core
IdeaCFBxEnd:
;       lds     si,[bp+12]
        lds     di,ss:[si+12]           ; Get IV for writing back
        mov     ax,x0
        xchg    ah,al
        mov     [di],ax                 ; Use stosw?
        xchg    bh,bl
        xchg    ch,cl
        mov     [di+2],x1
        mov     [di+4],x2
        xchg    dh,dl
        mov     [di+6],dx
        pop     ds
        pop     di
        pop     si
        pop     bp
        ret
_IdeaCFBx endp

        end